;************************************************************************
;*
;* Copyright:
;*	Freescale Semiconductor, INC. All Rights Reserved.  
;*  You are hereby granted a copyright license to use, modify, and
;*  distribute the SOFTWARE so long as this entire notice is
;*  retained without alteration in any modified and/or redistributed
;*  versions, and that such modified versions are clearly identified
;*  as such. No licenses are granted by implication, estoppel or
;*  otherwise under any patents or trademarks of Freescale Semiconductor, 
;*  Inc. This software is provided on an "AS IS" basis and without warranty.
;*
;*  To the maximum extent permitted by applicable law, FREESCALE 
;*  DISCLAIMS ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING 
;*  IMPLIED WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
;*  PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH REGARD TO THE 
;*  SOFTWARE (INCLUDING ANY MODIFIED VERSIONS THEREOF) AND ANY 
;*  ACCOMPANYING WRITTEN MATERIALS.
;* 
;*  To the maximum extent permitted by applicable law, IN NO EVENT
;*  SHALL FREESCALE BE LIABLE FOR ANY DAMAGES WHATSOEVER (INCLUDING 
;*  WITHOUT LIMITATION, DAMAGES FOR LOSS OF BUSINESS PROFITS, BUSINESS 
;*  INTERRUPTION, LOSS OF BUSINESS INFORMATION, OR OTHER PECUNIARY
;*  LOSS) ARISING OF THE USE OR INABILITY TO USE THE SOFTWARE.   
;* 
;*  Freescale assumes no responsibility for the maintenance and support
;*  of this software
;*************************************************************************
;*
;*  FILE NAME: iir16.s
;*
;*  PURPOSE:  IIR16 routines definition
;*
;*  AUTHOR: Andrey Butok
;*		IIR16 function optimized for eMAC by Igor Drozdinsky
;*
;***********************************************************************
.text
;#define __EMAC_H
;#include "emac.h"

; __FMAC_SR16 is the value written to MACSR for the duration of _IIR16_EMAC.
; The __FRACT_M conditional around it is commented out, so 0x00000070 is
; always the value used.
; NOTE(review): 0x70 presumably selects the eMAC fractional mode with rounding
; (MACSR S/U, F/I and R/T bits) - confirm against the ColdFire eMAC manual.
;#ifdef __FRACT_M
#define __FMAC_SR16 0x00000070
;#else
;#define __FMAC_SR16 0x00000000
;#endif

; Exported entry point of this file.
	.XDEF	_IIR16_EMAC
	
; NOTE(review): _malloc and _free are declared here but are not referenced by
; the code visible in this file.
	.extern	_malloc
	.extern	_free
	
	
;******************************************************
;* NAME: void IIR16( struct tIirStruct *pIIR, Frac16* pX, Frac16* pY, unsigned int n)
;*       (exported from this file under the symbol _IIR16_EMAC)
;*
;* DESCRIPTION: Computes an Infinite Impulse Response (IIR) filter for an array of 16-bit fractional data values.
;******************************************************
;* Used registers:
;*	a2       pIIR	- Pointer to a data structure containing private data for the IIR filter
;*	d0, d1	 iCurY0, iCurY1, iCurY2, iCurY3	- Current Y (two 16-bit samples per register)
;*	d2, d3	 iCurX0, iCurX1, iCurX2, iCurX3	- Current X (two 16-bit samples per register)
;*	d4		 iCurA, iCurB	- Current coefficients (iCurA in the upper word, iCurB in the lower word)
;*	d5		 j	- inner loop counter
;*	d7		 i	- outer loop counter
;*	d6		 tmp	- saved value for inner loop counter calculation
;*	a0       pX		- Pointer to the current X for outer loop
;*	a1       pY		- Pointer to the current Y for outer loop
;*	a4       pCurX	- Pointer to the current X for inner loop
;*	a4       pCurHistory	- Pointer to the current element of history buffer
;*	a5       pPredY  	- Pointer to the previous Y for inner loop (called pCurY in the code comments)
;*	a3       pCurCoef	- Pointer to the current coefficient
;*	a6       pIirCoef - Pointer to the coefficients -> {a0,a1,b1,a2,b2...}
;*	ACC0     iOut0	- Accumulator 0
;*	ACC1     iOut1	- Accumulator 1
;*	ACC2     iOut2	- Accumulator 2
;*	ACC3     iOut3	- Accumulator 3
;******************************************************
_IIR16_EMAC:
; Entry: saves every data/address register and MACSR, loads __FMAC_SR16 into
; MACSR, fetches the arguments from the stack and clears ACC0..ACC3.
;
; Stack layout after the prologue (offsets from a7):
;    0..59  saved d0-d7/a0-a6 (15 longwords)
;   60      saved MACSR
;   64      return address
;   68      pIIR
;   72      pX
;   76      pY
;   80      n   (read later in the routine)
;
; Fields of *pIIR as accessed by this routine:
;    0(a2)  pIirCoef
;    4(a2)  iIirCoefCount
;    8(a2)  pIirHistory
;   12(a2)  iIirHistoryCount

;---=Saving values of used registers=---
	lea -64(a7), a7
	movem.l d0-d7/a0-a6, (a7)
	
;--== Saving old MAC status register to the stack ==--
	move.l	MACSR, d0
	move.l	d0, 60(a7)
;--== Loading new MAC status register ==--
	move.l	#__FMAC_SR16, d0
	move.l	d0, MACSR
	
;---=Most useful parameters are moved from stack to registers.=---	
	move.l	72(a7), a0			;pX
	move.l	76(a7), a1			;pY
	move.l	68(a7), a2			;pIIR
	move.l	(a2), a6			;pIirCoef = pIIR->pIirCoef
;--== All four accumulators start at zero; later code relies on movclr to re-zero them ==--
	move.l	#0, ACC0
	move.l	#0, ACC1
	move.l	#0, ACC2
	move.l	#0, ACC3		
;---====== Begin of cycle of getting Y[1]..Y[N] (N = (pIIR->iIirCoefCount - 1) / 2)======---
; Phase 1, four outputs per pass.
; i (d7) and tmp (d6) count coefficients after a0, i.e. two per output sample.
; This 4-at-a-time path is taken only when (iIirCoefCount - 1) >> 3 is non-zero;
; otherwise control goes to .EndD4Z and the samples are produced one at a time.
; For each group of four outputs:
;   1. a0 is applied to the four new inputs;
;   2. .ForIn1 applies coefficient pairs to samples already processed in this call;
;   3. the remaining pairs are applied to the history buffer (.ForInH) or, when
;      iIirHistoryCount is zero, .BegNoHistory finishes with the in-register samples;
;   4. .EndH adds the feedback between the four new outputs and stores them.

	move.l	4(a2), d7			;
	subq.l	#1, d7				;tmp = i = pIIR->iIirCoefCount - 1;
	move.l	d7, d6				;
	
	
	asr.l	#3, d7
			
	beq		.EndD4Z				;if(i >> 3)
								;{
 		
;--==First N output samples computation==--
		
	move.l	d6, d7
	subq.l	#8, d7				;i -= 8;

;--==Computation without using history buffer==--
		
.BegD4:								;do{
			
;--== Next input samples loading	==--
	move.l	(a0), d2			;iCurX0 = pX[0]; iCurX1 = pX[1];
	move.l	4(a0), d3			;iCurX2 = pX[2]; iCurX3 = pX[3];
	movea.l	a0, a4				;pCurX = pX;
	lea		8(a0), a0			;pX += 4;
	
;--== Make four previous output samples	zero==--
	clr.l	d0					;iCurY0 = iCurY1 = 0;
	clr.l	d1					;iCurY2 = iCurY3 = 0;
	movea.l	a1, a5				;pCurY = pY;
	
;--== Next coefficient loading	==--
	move.l	a6, a3				;pCurCoef = pIirCoef;
	move.l	(a3), d4			;iCurA = *pCurCoef;   (upper word of d4 = pIirCoef[0])
	lea		2(a3), a3			;pCurCoef++;
	
;--== Input samples and first coefficient multiplications ==--
;--== (the accumulators are zero here, so += acts as an assignment) ==--
	mac.w	d4.u, d2.u, <<, ACC0	;iOut0 = iCurA * iCurX0;
	mac.w	d4.u, d2.l, <<, ACC1	;iOut1 = iCurA * iCurX1;
	mac.w	d4.u, d3.u, <<, ACC2	;iOut2 = iCurA * iCurX2;
	mac.w	d4.u, d3.l, <<, ACC3	;iOut3 = iCurA * iCurX3;

	
;--== The count of inner loops calculation ==-- 
	move.l	d6, d5				;
	sub.l	d7, d5				;j = tmp - i - 8;
	subq.l	#8, d5				;
								
	ble		.EndIn1				;while(j > 0)
								;{
	
.ForIn1Beg:
;--== Next coefficients loading	==--
	move.l	(a3)+, d4			;iCurA = *pCurCoef++; iCurB = *pCurCoef++;

.ForIn1:
; Each pass consumes two coefficient pairs (j -= 4) and shifts two older
; X samples and two older Y samples into the working registers.
	
;--== Three first input samples and coefficients multiplication ==--
	mac.w	d4.u, d2.u, <<, ACC1	;iOut1 += iCurA * iCurX0;
	mac.w	d4.u, d2.l, <<, ACC2 	;iOut2 += iCurA * iCurX1;
	mac.w	d4.u, d3.u, <<, ACC3	;iOut3 += iCurA * iCurX2;

;--== Three first output samples and coefficients multiplication ==--	
	mac.w	d4.l, d0.u, <<, ACC1	;iOut1 += iCurB * iCurY0;
	mac.w	d4.l, d0.l, <<, ACC2	;iOut2 += iCurB * iCurY1;
	mac.w	d4.l, d1.u, <<, ACC3	;iOut3 += iCurB * iCurY2;
	
	
;--== Loading two next input samples ==--	
	move.l	d2, d3					;iCurX3 = iCurX1; 	iCurX2 = iCurX0;
	move.l	-(a4), d2				;iCurX1 = *--pCurX; iCurX0 = *--pCurX;
;--== Loading two next output samples ==--	
	move.l	d0, d1					;iCurY3 = iCurY1; 	iCurY2 = iCurY0;
	move.l	-(a5), d0				;iCurY1 = *--pCurY; iCurY0 = *--pCurY;
	
	
;--== Fourth input sample and coefficient multiplication ==--
	mac.w	d4.u, d2.l, <<, ACC0	;iOut0 += iCurA * iCurX1;
;--== Fourth output sample and coefficient multiplication ==--
	mac.w	d4.l, d0.l, <<, ACC0	;iOut0 += iCurB * iCurY1;
	
;--== Next coefficients loading	==--
	move.l	(a3)+, d4				;iCurA = *pCurCoef++; iCurB = *pCurCoef++;
	
;--== Next four input sample and coefficients multiplication ==--
	mac.w	d4.u, d2.u, <<, ACC0	;iOut0 += iCurA * iCurX0;
	mac.w	d4.u, d2.l, <<, ACC1	;iOut1 += iCurA * iCurX1;
	mac.w	d4.u, d3.u, <<, ACC2	;iOut2 += iCurA * iCurX2;
	mac.w	d4.u, d3.l, <<, ACC3	;iOut3 += iCurA * iCurX3;
	
;--== Next four output sample and coefficients multiplication ==--
	mac.w	d4.l, d0.u, <<, ACC0			;iOut0 += iCurB * iCurY0;
	mac.w	d4.l, d0.l, <<, ACC1			;iOut1 += iCurB * iCurY1;
	mac.w	d4.l, d1.u, <<, ACC2			;iOut2 += iCurB * iCurY2;
;--== Last mac instruction with next coefficients loading ==--
	mac.w	d4.l, d1.l, <<, (a3)+, d4, ACC3	;iOut3 += iCurB * iCurY3; 
											;iCurA = *pCurCoef++; iCurB = *pCurCoef++;
	
	subq.l	#4, d5 					;	j -= 4;
	bgt		.ForIn1					;} //while(j > 0)
	
.EndIn1C:	
;--== Correcting pointer to coefficients after inner loop ==--
;--== (undoes the prefetch done by the last mac: 4 bytes = one iCurA/iCurB pair) ==--
	lea		-4(a3), a3				;pCurCoef -= 2 (16-bit elements)
	
.EndIn1:	
	
	move.l	12(a2), d5				;if(pIIR->iIirHistoryCount)
;---=Computation using history buffer==--
	beq		.BegNoHistory			;{

;--== The count of inner loops calculation ==-- 
	move.l	d7, d5					;
	addq.l	#8, d5					;j = i + 8;

	ble		.EndInH					;if(j > 0)
									;{

;--== Setting curent history buffer pointer ==--
;--== (points one past the end; the buffer is read downwards, X word then Y word) ==--
	move.l  8(a2), a4				;
	adda.l	d6, a4					;
	adda.l	d6, a4					;pCurHistory = pIIR->pIirHistory + tmp * 2;
	

	addq.l	#1, d5					;
	btst	#1, d5					;if((j + 1) & 2)
	beq		.ForInHBeg				;{
;--== This part is executed only if (j + 1)/2 is odd ==--	
;--== It consumes a single coefficient pair and a single X/Y pair from the history ==--
	
	subq.l	#1, d5
	
;--== Next coefficients loading	==--
	move.l	(a3)+, d4				;iCurA = *pCurCoef++; iCurB = *pCurCoef++;
	
;--== Next input sample loading	from history buffer==--
;--== (the four comments below describe the combined effect of the four instructions) ==--
	move.w	d2, d3					;iCurX3 = iCurX2;
	move.w	-(a4), d2				;iCurX2 = iCurX1
	swap	d2						;iCurX1 = iCurX0
	swap	d3						;iCurX0 = *--pCurHistory;
	
;--== Next output sample loading from history buffer==--
	move.w	d0, d1					;iCurY3 = iCurY2;
	move.w	-(a4), d0				;iCurY2 = iCurY1;
	swap	d0						;iCurY1 = iCurY0;
	swap	d1						;iCurY0 = *--pCurHistory;
	
	mac.w	d4.u, d2.u, <<, ACC0	;iOut0 += iCurA * iCurX0;
	mac.w	d4.u, d2.l, <<, ACC1	;iOut1 += iCurA * iCurX1;
	mac.w	d4.u, d3.u, <<, ACC2	;iOut2 += iCurA * iCurX2;
	mac.w	d4.u, d3.l, <<, ACC3	;iOut3 += iCurA * iCurX3;
	
	mac.w	d4.l, d0.u, <<, ACC0	;iOut0 += iCurB * iCurY0;
	mac.w	d4.l, d0.l, <<, ACC1	;iOut1 += iCurB * iCurY1;
	mac.w	d4.l, d1.u, <<, ACC2	;iOut2 += iCurB * iCurY2;
	mac.w	d4.l, d1.l, <<, ACC3	;iOut3 += iCurB * iCurY3;
	
	subq.l	#2, d5					;j -= 2;
	ble		.EndInH					;}
									;while(j > 0)
									;{
	
.ForInHBeg:								
	subq.l	#1, d5
;--== Next coefficients loading	==--
	move.l	(a3)+, d4				;iCurA = *pCurCoef++; iCurB = *pCurCoef++;

.ForInH:
; Same two-pairs-per-pass scheme as .ForIn1, but the older samples come from
; the history buffer, where X and Y words are interleaved.
		
;--== Three first input samples and coefficients multiplication ==--
	mac.w	d4.u, d2.u, <<, ACC1	;iOut1 += iCurA * iCurX0
	mac.w	d4.u, d2.l, <<, ACC2	;iOut2 += iCurA * iCurX1
	mac.w	d4.u, d3.u, <<, ACC3	;iOut3 += iCurA * iCurX2
	
;--== Three first output samples and coefficients multiplication ==--
	mac.w	d4.l, d0.u, <<, ACC1	;iOut1 += iCurB * iCurY0
	mac.w	d4.l, d0.l, <<, ACC2	;iOut2 += iCurB * iCurY1
	mac.w	d4.l, d1.u, <<, ACC3	;iOut3 += iCurB * iCurY2
	
	
;--== Loading two next input samples ==--	
;--== Each longword read from the history holds one Y (upper word) and one X ==--
;--== (lower word), so d2/d0 are mixed X/Y until the correction step below ==--
	move.l	d2, d3					;iCurX3 = iCurX1; 	iCurX2 = iCurX0;
	move.l	-(a4), d2				;d2 = {Y, X} of the newer history entry
;--== Loading two next output samples ==--	
	move.l	d0, d1					;iCurY3 = iCurY1; 	iCurY2 = iCurY0;
	move.l	-(a4), d0				;d0 = {Y, X} of the older history entry
			
	
;--== Fourth input sample and coefficient multiplication ==--
	mac.w	d4.u, d2.l, <<, ACC0	;iOut0 += iCurA * X (lower word of the newer entry)
;--== Fourth output sample and coefficient multiplication ==--
	mac.w	d4.l, d2.u, <<, ACC0	;iOut0 += iCurB * Y (upper word of the newer entry)

;--== Correcting input and output samples loaded from history buffer ==--	
;--== Regroups the words so that d2 holds the two X samples and d0 the two Y ==--
;--== samples; d4.l is used as scratch (d4 is reloaded right after) ==--
	swap	d2						;
	move.w	d2, d4					;
	move.w	d0, d2					;
	move.w	d4, d0					;
	swap	d2						;
	
;--== Next coefficients loading	==--
	move.l	(a3)+, d4				;iCurA = *pCurCoef++; iCurB = *pCurCoef++;
	
;--== Next Four input sample and coefficients multiplication ==--
	mac.w	d4.u, d2.u, <<, ACC0	;iOut0 += iCurA * iCurX0
	mac.w	d4.u, d2.l, <<, ACC1	;iOut1 += iCurA * iCurX1
	mac.w	d4.u, d3.u, <<, ACC2	;iOut2 += iCurA * iCurX2
	mac.w	d4.u, d3.l, <<, ACC3	;iOut3 += iCurA * iCurX3
	
;--== Next Four output sample and coefficients multiplication ==--
	mac.w	d4.l, d0.u, <<, ACC0			;iOut0 += iCurB * iCurY0
	mac.w	d4.l, d0.l, <<, ACC1			;iOut1 += iCurB * iCurY1
	mac.w	d4.l, d1.u, <<, ACC2			;iOut2 += iCurB * iCurY2
	mac.w	d4.l, d1.l, <<, (a3)+, d4, ACC3	;iOut3 += iCurB * iCurY3
											;iCurA = *pCurCoef++; iCurB = *pCurCoef++;
	
	subq.l	#4, d5					;j -= 4;
	bgt		.ForInH					;} // while(j > 0)
	
.EndInH:	
	bra		.EndH					;} // if(pIIR->iIirHistoryCount)
									;else{

.BegNoHistory:

;---=Final computations if history buffer is void==--
; Three further coefficient pairs are applied to the samples still held in
; d2/d3 (X) and d0/d1 (Y); each pair reaches one accumulator fewer than the
; previous one.
;--== Next coefficients loading	==--
	move.l	(a3)+, d4				;iCurA = *pCurCoef++; iCurB = *pCurCoef++;
	
	mac.w	d4.u, d2.u, <<, ACC1	;iOut1 += iCurA * iCurX0
	mac.w	d4.u, d2.l, <<, ACC2	;iOut2 += iCurA * iCurX1
	mac.w	d4.u, d3.u, <<, ACC3	;iOut3 += iCurA * iCurX2
	
	mac.w	d4.l, d0.u, <<, ACC1	;iOut1 += iCurB * iCurY0
	mac.w	d4.l, d0.l, <<, ACC2	;iOut2 += iCurB * iCurY1
	mac.w	d4.l, d1.u, <<, ACC3	;iOut3 += iCurB * iCurY2
	
;--== Next coefficients loading	==--
	move.l	(a3)+, d4				;iCurA = *pCurCoef++; iCurB = *pCurCoef++;
	
	mac.w	d4.u, d2.u, <<, ACC2	;iOut2 += iCurA * iCurX0
	mac.w	d4.u, d2.l, <<, ACC3	;iOut3 += iCurA * iCurX1
	
	mac.w	d4.l, d0.u, <<, ACC2	;iOut2 += iCurB * iCurY0
	mac.w	d4.l, d0.l, <<, ACC3	;iOut3 += iCurB * iCurY1
	
;--== Next coefficients loading	==--
	move.l	(a3)+, d4				;iCurA = *pCurCoef++; iCurB = *pCurCoef++;
	
	mac.w	d4.u, d2.u, <<, ACC3	;iOut3 += iCurA * iCurX0
	
	mac.w	d4.l, d0.u, <<, ACC3	;iOut3 += iCurB * iCurY0
									;} // else if (pIIR->iIirHistoryCount)
	
.EndH:

;--==Storing computed samples into the memory==--
; The Y registers were zero for the four outputs of this group, so their mutual
; feedback is added here, as each output becomes available:
;   d4.u = pIirCoef[2], d4.l = pIirCoef[4], d5.u = pIirCoef[6]
; (b1, b2 and b3 under the {a0,a1,b1,a2,b2,...} layout given in the header).
; movclr reads an accumulator and clears it for the next group.
				
	move.l	a6, a3					;pCurCoef = pIirCoef
	move.l	4(a3), d4				;d4.u = pIirCoef[2];
	move.w	8(a3), d4				;d4.l = pIirCoef[4];
	move.l	12(a3), d5				;d5.u = pIirCoef[6];
	
	movclr.l	ACC0, d0			;*pY++ = iOut0
;#ifndef __FRACT_M	
;	swap	d0						;
;#endif	
	move.w	d0, (a1)+				;
			
	mac.w	d4.u, d0.l, <<, ACC1	;iOut1 += pIirCoef[2] * iOut0
	
	movclr.l	ACC1, d1			;
;#ifndef __FRACT_M	
;	swap	d1						;*pY++ = iOut1
;#endif	
	move.w	d1, (a1)+				;
	
	mac.w	d4.u, d1.l, <<, ACC2	;iOut2 += pIirCoef[2] * iOut1
	mac.w	d4.l, d0.l, <<, ACC2	;iOut2 += pIirCoef[4] * iOut0
	
	movclr.l	ACC2, d2			;
;#ifndef __FRACT_M	
;	swap	d2						;*pY++ = iOut2
;#endif	
	move.w	d2, (a1)+				;
	
	mac.w	d4.u, d2.l, <<, ACC3	;iOut3 += pIirCoef[2] * iOut2
	mac.w	d4.l, d1.l, <<, ACC3	;iOut3 += pIirCoef[4] * iOut1
	mac.w	d5.u, d0.l, <<, ACC3	;iOut3 += pIirCoef[6] * iOut0
	
	movclr.l	ACC3, d0			;
;#ifndef __FRACT_M	
;	swap	d0						;*pY++ = iOut3
;#endif	
	move.w	d0, (a1)+				;

	subq.l	#8, d7					;i -= 8;   (four outputs = eight coefficients)
	bgt		.BegD4					;}while(i > 0);
	
	bra		.EndD4

; Phase 1 tail: the outputs of the first N that were not produced four at a
; time are computed here one sample per pass, using ACC0 only.
;   .EndD4Z - entered when the 4-at-a-time path was skipped: i = tmp.
;   .EndD4  - entered after the 4-at-a-time loop: i is restored with i += 8.
; i (d7) counts remaining coefficients and drops by 2 per output sample.
.EndD4Z:
	
	move.l	d6, d7					;i = tmp;
	beq		.EndTailH				;nothing to do when tmp == 0
	bra		.BegTailH				;
	
.EndD4:

	addq.l	#8, d7					; i += 8;
	
	beq		.EndTailH				;while(i)
									;{
	
;---=Computation of N % 4 last output samples==--
.BegTailH:
	
;--== Next input samples loading	==--
	move.w	(a0), d2				;iCurX0 = *pX;
	movea.l	a0, a4					;pCurX = pX;
	lea		2(a0), a0				;pX++;
	
;--== Setting output samples pointer ==--
	movea.l	a1, a5					;pCurY = pY;
			
;--== Next coefficients loading	==--
	move.l	a6, a3					;pCurCoef = pIirCoef;
	move.l	(a3), d4				;iCurA = *pCurCoef;   (upper word of d4 = pIirCoef[0])
	lea		2(a3), a3				;pCurCoef++;
	
;--== Input samples and first coefficient multiplications ==--
	mac.w	d4.u, d2.l, <<, ACC0	;iOut0 = iCurA * iCurX0   (ACC0 is zero here)
	
;--== The count of inner loops calculation ==-- 
	move.l	d6, d5					;
	sub.l	d7, d5					;j = tmp - i;

	ble		.EndIn1TailH			;while(j > 0)
									;{

.ForIn1TailH:
; One coefficient pair per pass, applied to samples taken from the pX/pY arrays.
	
;--== Next coefficients loading	==--
	move.l	(a3)+, d4				;iCurA = *pCurCoef++; iCurB = *pCurCoef++;
	
;--== Next input sample loading	==--
	move.w	-(a4), d2				;iCurX0 = *--pCurX;
;--== Next output sample loading	==--
	move.w	-(a5), d0				;iCurY0 = *--pCurY;
	
;--== Two multiplications for each output sample ==--
	mac.w	d4.u, d2.l, <<, ACC0	;iOut0 += iCurA * iCurX0
	mac.w	d4.l, d0.l, <<, ACC0	;iOut0 += iCurB * iCurY0
	
	subq.l	#2, d5 					;j-= 2;
	bgt		.ForIn1TailH			;} // while(j > 0)
	
.EndIn1TailH:	
	
	move.l	12(a2), d5				;if(pIIR->iIirHistoryCount)
	beq		.EndInHTail				;{

;--== The count of inner loops calculation ==-- 
	move.l	d7, d5					;j = i

	ble		.EndInHTail				;if(j > 0)
									;{

;--== Current history buffer pointer initialization==--
;--== (points one past the end; the buffer is read downwards, X word then Y word) ==--
	move.l  8(a2), a4				;pCurHistory = pIirHistory + tmp * 2
	adda.l	d6, a4					;
	adda.l	d6, a4					;while(j > 0)
									;{
	
.ForInHTail:
; One coefficient pair per pass, applied to samples taken from the history buffer.
	
;--== Next coefficients loading	==--
	move.l	(a3)+, d4				;iCurA = *pCurCoef++; iCurB = *pCurCoef++;
	
;--== Next input sample loading	==--
	move.w	-(a4), d2				;iCurX0 = *--pCurHistory;
;--== Next output sample loading	==--
	move.w	-(a4), d0				;iCurY0 = *--pCurHistory;
	
;--== Two multiplications for each output sample ==--
	mac.w	d4.u, d2.l, <<, ACC0	;iOut0 += iCurA * iCurX0
	mac.w	d4.l, d0.l, <<, ACC0	;iOut0 += iCurB * iCurY0
	
	subq.l	#2, d5					;j -= 2;
	bgt		.ForInHTail				;}//while (j > 0)
	
.EndInHTail:							;}//if(pIIR->iIirHistoryCount)

	
;--==Storing computed sample into the memory==--
;--== movclr reads ACC0 and clears it for the next sample ==--
	
	movclr.l	ACC0, d0			;
;#ifndef __FRACT_M	
;	swap	d0						;*pY++ = iOut0;
;#endif	
	move.w	d0, (a1)+				;
	
	subq.l	#2, d7					;i -= 2;
	bgt		.BegTailH				;}
	
.EndTailH:								;}//if(i)

;---====== Begin of cycle of getting Y[N+1]..Y[n] (N = (pIIR->iIirCoefCount - 1) / 2)======---
; Phase 2, four outputs per pass. Here tmp (d6) = N, the number of coefficient
; pairs, and i (d7) = (n - N) / 4, the number of 4-sample groups.
; All older samples are read directly from the pX/pY arrays (no history buffer).
; NOTE(review): the loop below exits on i == 0 only, so this presumably relies
; on n >= N - confirm against the callers.
	move.l	4(a2), d6				;tmp = IIR->iIirCoefCount;
	
	move.l	80(a7), d7				;i = n;
	
	subq.l	#1, d6					;tmp = (tmp - 1) / 2;
	asr.l	#1, d6					;
	sub.l	d6, d7					;i -= tmp;
	 		
	asr.l	#2, d7					; i /= 4
	
	beq		.BegTail2				;while(i != 0)
									;{
;--==Y[N+1]..Y[k] computation, where k is (N + 1) + ((n - N - 1) % 4)
		
.BegD4E:								;Begin of inner loop 
	
;--== Next input samples loading ==--
	move.l	(a0), d2				;iCurX0 = pX[0]; iCurX1 = pX[1];
	move.l	4(a0), d3				;iCurX2 = pX[2]; iCurX3 = pX[3];
	movea.l	a0, a4					;pCurX = pX;
	lea		8(a0), a0				;pX += 4;
	
;--== Make four previous output samples	zero==--
	clr.l	d0						;iCurY0 = iCurY1 = 0;
	clr.l	d1						;iCurY2 = iCurY3 = 0;
	movea.l	a1, a5					;pCurY = pY;
	
;--== Next coefficient loading	==--
	move.l	a6, a3					;pCurCoef = pIirCoef;
	move.l	(a3), d4				;iCurA = *pCurCoef;   (upper word of d4 = pIirCoef[0])
	lea		2(a3), a3				;pCurCoef++;
	
;--== Input samples and first coefficient multiplications ==--
;--== (the accumulators are zero here, so += acts as an assignment) ==--
	mac.w	d4.u, d2.u, <<, ACC0	;iOut0 = iCurA * iCurX0
	mac.w	d4.u, d2.l, <<, ACC1	;iOut1 = iCurA * iCurX1
	mac.w	d4.u, d3.u, <<, ACC2	;iOut2 = iCurA * iCurX2
	mac.w	d4.u, d3.l, <<, ACC3	;iOut3 = iCurA * iCurX3
	
;--== The count of inner loops calculation ==-- 
	move.l	d6, d5					;j = tmp;   (number of coefficient pairs)
	btst	#0, d5					;
	beq		.ForIn1EBeg				;if(j & 1)
									;{
	
;--== This multiplications executed only when j is odd ==--
;--== It consumes a single coefficient pair so that the main loop, which ==--
;--== takes two pairs per pass, is left with an even count ==--

;--== Next coefficient loading	==--
	move.l	(a3)+, d4				;iCurA = *pCurCoef++; iCurB = *pCurCoef++;
	
;--== Next input sample loading	==--
;--== (the four comments below describe the combined effect of the four instructions) ==--
	move.w	d2, d3					;iCurX3 = iCurX2;
	move.w	-(a4), d2				;iCurX2 = iCurX1;
	swap	d2						;iCurX1 = iCurX0;
	swap	d3						;iCurX0 = *--pCurX;
	
;--== Next output sample loading	==--
	move.w	d0, d1					;iCurY3 = iCurY2;
	move.w	-(a5), d0				;iCurY2 = iCurY1;
	swap	d0						;iCurY1 = iCurY0;
	swap	d1						;iCurY0 = *--pCurY;
	
	mac.w	d4.u, d2.u, <<, ACC0	;iOut0 += iCurA * iCurX0
	mac.w	d4.u, d2.l, <<, ACC1	;iOut1 += iCurA * iCurX1
	mac.w	d4.u, d3.u, <<, ACC2	;iOut2 += iCurA * iCurX2
	mac.w	d4.u, d3.l, <<, ACC3	;iOut3 += iCurA * iCurX3
	
	mac.w	d4.l, d0.u, <<, ACC0	;iOut0 += iCurB * iCurY0
	mac.w	d4.l, d0.l, <<, ACC1	;iOut1 += iCurB * iCurY1
	mac.w	d4.l, d1.u, <<, ACC2	;iOut2 += iCurB * iCurY2
	mac.w	d4.l, d1.l, <<, ACC3	;iOut3 += iCurB * iCurY3
	
	subq.l	#1, d5					;j -= 1;
	beq		.EndIn1E				;}

.ForIn1EBeg:

;--== Next coefficients loading	==--
	move.l	(a3)+, d4				;iCurA = *pCurCoef++; iCurB = *pCurCoef++;

.ForIn1E:
; Each pass consumes two coefficient pairs; the loop ends when j - 2 goes negative.
	subq.l	#2, d5					;for(;j >= 0; j -=2)
	blt		.EndIn1E				;{
	
	
;--== Three first input samples and coefficients multiplication ==--
	mac.w	d4.u, d2.u, <<, ACC1	;iOut1 += iCurA * iCurX0
	mac.w	d4.u, d2.l, <<, ACC2	;iOut2 += iCurA * iCurX1
	mac.w	d4.u, d3.u, <<, ACC3	;iOut3 += iCurA * iCurX2
	
;--== Three first output samples and coefficients multiplication ==--
	mac.w	d4.l, d0.u, <<, ACC1	;iOut1 += iCurB * iCurY0
	mac.w	d4.l, d0.l, <<, ACC2	;iOut2 += iCurB * iCurY1
	mac.w	d4.l, d1.u, <<, ACC3	;iOut3 += iCurB * iCurY2
	
	
;--== Loading two next input samples ==--	
	move.l	d2, d3					;iCurX3 = iCurX1; 	iCurX2 = iCurX0;
	move.l	-(a4), d2				;iCurX1 = *--pCurX; iCurX0 = *--pCurX;
;--== Loading two next output samples ==--	
	move.l	d0, d1					;iCurY3 = iCurY1; 	iCurY2 = iCurY0;
	move.l	-(a5), d0				;iCurY1 = *--pCurY; iCurY0 = *--pCurY;
	
	
;--== Fourth input sample and coefficient multiplication ==--
	mac.w	d4.u, d2.l, <<, ACC0	;iOut0 += iCurA * iCurX1
;--== Fourth output sample and coefficient multiplication ==--
	mac.w	d4.l, d0.l, <<, ACC0	;iOut0 += iCurB * iCurY1
	
;--== Next coefficients loading	==--
	move.l	(a3)+, d4				;iCurA = *pCurCoef++; iCurB = *pCurCoef++;
	

;--== Next Four input sample and coefficients multiplication ==--
	mac.w	d4.u, d2.u, <<, ACC0	;iOut0 += iCurA * iCurX0
	mac.w	d4.u, d2.l, <<, ACC1	;iOut1 += iCurA * iCurX1
	mac.w	d4.u, d3.u, <<, ACC2	;iOut2 += iCurA * iCurX2
	mac.w	d4.u, d3.l, <<, ACC3	;iOut3 += iCurA * iCurX3
	
;--== Next Four output sample and coefficients multiplication ==--
	mac.w	d4.l, d0.u, <<, ACC0			;iOut0 += iCurB * iCurY0
	mac.w	d4.l, d0.l, <<, ACC1			;iOut1 += iCurB * iCurY1
	mac.w	d4.l, d1.u, <<, ACC2			;iOut2 += iCurB * iCurY2
;--== Last mac instruction with next coefficients loading ==--
	mac.w	d4.l, d1.l, <<, (a3)+, d4, ACC3	;iOut3 += iCurB * iCurY3
											;iCurA = *pCurCoef++; iCurB = *pCurCoef++;
	
	bra		.ForIn1E				;}
	
.EndIn1E:	
	
;--==Storing computed samples into the memory==--
; The Y registers were zero for the four outputs of this group, so their mutual
; feedback is added here. d3 is a scratch copy of tmp (N): each feedback
; coefficient is loaded and used only if the filter has that many pairs.
;   d4.u = pIirCoef[2], d4.l = pIirCoef[4], d5.u = pIirCoef[6]
; (b1, b2 and b3 under the {a0,a1,b1,a2,b2,...} layout given in the header).
	
	move.l	d6, d3
	move.l	a6, a3					;pCurCoef = pIirCoef;

	subq.l	#1, d3
	blt		.NextPre				;tmp < 1: no feedback coefficients
	move.l	4(a3), d4				;d4.u = pIirCoef[2];
	
	subq.l	#1, d3
	blt		.NextPre				;tmp < 2
	move.w	8(a3), d4				;d4.l = pIirCoef[4];
	
	subq.l	#1, d3
	blt		.NextPre				;tmp < 3
	move.l	12(a3), d5				;d5.u = pIirCoef[6];

.NextPre:
	
	move.l	d6, d3
	
	movclr.l	ACC0, d0			;	
;#ifndef __FRACT_M	
;	swap	d0						;*pY++ = iOut0
;#endif	
	move.w	d0, (a1)+				;
	
	subq.l	#1, d3
	blt		.NextY1
	mac.w	d4.u, d0.l, <<, ACC1	;iOut1 += pIirCoef[2] * iOut0
	
.NextY1:
	
	move.l	d6, d3
	
	movclr.l	ACC1, d1			;
;#ifndef __FRACT_M	
;	swap	d1						;*pY++ = iOut1
;#endif	
	move.w	d1, (a1)+				;
	
	subq.l	#1, d3
	blt		.NextY2
	mac.w	d4.u, d1.l, <<, ACC2	;iOut2 += pIirCoef[2] * iOut1
	
	subq.l	#1, d3
	blt		.NextY2
	mac.w	d4.l, d0.l, <<, ACC2	;iOut2 += pIirCoef[4] * iOut0
	
.NextY2:

	move.l	d6, d3
	
	movclr.l	ACC2, d2			;
;#ifndef __FRACT_M	
;	swap	d2						;*pY++ = iOut2
;#endif	
	move.w	d2, (a1)+				;
	
	subq.l	#1, d3
	blt		.NextY3
	mac.w	d4.u, d2.l, <<, ACC3	;iOut3 += pIirCoef[2] * iOut2
	
	subq.l	#1, d3
	blt		.NextY3
	mac.w	d4.l, d1.l, <<, ACC3	;iOut3 += pIirCoef[4] * iOut1
	
	subq.l	#1, d3
	blt		.NextY3
	mac.w	d5.u, d0.l, <<, ACC3	;iOut3 += pIirCoef[6] * iOut0

.NextY3:		

	movclr.l	ACC3, d0			;
;#ifndef __FRACT_M	
;	swap	d0						;*pY++ = iOut3
;#endif	
	move.w	d0, (a1)+				;

	subq.l	#1, d7					;i --;
	bne		.BegD4E					;}
	
.EndD4E:
;---=Computation of the (n - N) % 4 last output samples==--
; Phase 2 tail: the outputs left over after the 4-at-a-time loop are computed
; one sample per pass, using ACC0 only. tmp (d6) = N is the number of
; coefficient pairs and i (d7) = (n - N) & 3 is the number of samples to do.
; All older samples are read directly from the pX/pY arrays.
; A filter with a single coefficient (N == 0) has no pairs to apply, so the
; inner loop is skipped in that case instead of running with a wrapped counter.
.BegTail2:

	move.l	4(a2), d6				;tmp = pIIR->iIirCoefCount;
	move.l	80(a7), d7				;i = n;
	
	subq.l	#1, d6					;tmp = (tmp - 1) / 2;
	asr.l	#1, d6					;
	sub.l	d6, d7					;i -= tmp;
	 		
	andi.l	#3, d7					;i &= 3;
	
	beq		.EndTail2				;while(i)
									;{
.BegOutTail2:
	
;--== Next input samples loading	==--
	move.w	(a0), d2				;iCurX0 = *pX;
	movea.l	a0, a4					;pCurX = pX;
	lea		2(a0), a0				;pX++;
	
	movea.l	a1, a5					;pCurY = pY;
	
;--== Next coefficient loading	==--
	move.l	a6, a3					;pCurCoef = pIirCoef;
	move.l	(a3), d4				;iCurA = *pCurCoef;   (upper word of d4 = pIirCoef[0])
	lea		2(a3), a3				;pCurCoef++;
	
;--== Input samples and first coefficient multiplications ==--
	mac.w	d4.u, d2.l, <<, ACC0	;iOut0 = iCurA * iCurX0   (ACC0 is zero here)
	
;--== The count of inner loops calculation ==-- 
	move.l	d6, d5					;j = tmp;   (move.l also sets Z when tmp == 0)
	beq		.EndInTail2				;if(j != 0)
									;{
			
.ForInTail2:							;do{
	
;--== Next coefficients loading	==--
	move.l	(a3)+, d4				;iCurA = *pCurCoef++; iCurB = *pCurCoef++;
	
;--== Next input samples loading	==--
	move.w	-(a4), d2				;iCurX0 = *--pCurX;
;--== Next output samples loading	==--
	move.w	-(a5), d0				;iCurY0 = *--pCurY;
	
;--== Two multiplications for each output sample ==--
	mac.w	d4.u, d2.l, <<, ACC0	;iOut0 += iCurA * iCurX0
	mac.w	d4.l, d0.l, <<, ACC0	;iOut0 += iCurB * iCurY0
	
	subq.l	#1, d5					;j--;
	bne		.ForInTail2				;}while(j);
									;}
	
.EndInTail2:	
	
;--==Storing computed sample into the memory==--
;--== movclr reads ACC0 and clears it for the next sample ==--
	movclr.l	ACC0, d0			;
;#ifndef __FRACT_M	
;	swap	d0						;*pY++ = iOut0;
;#endif	
	move.w	d0, (a1)+				;
	
	subq.l	#1, d7					;i--;
	bne		.BegOutTail2			;}//while(i)
	
.EndTail2:

;--==Filling history buffer==--
; Saves the most recent input/output samples for the next call and returns.
; pIIR->iIirHistoryCount is set to iIirCoefCount - 1. The buffer is filled
; downwards from its end with interleaved words: the newest X is stored last
; in memory, the matching Y just below it, and so on; the readers of the
; history walk it in the same order.
; A single-coefficient filter (iIirCoefCount == 1) has nothing to save, so the
; copy loop is skipped; the loop also exits on i <= 0 rather than on i == 0
; only, so that it cannot run away if the byte count is not a multiple of 4.
; NOTE(review): the copy steps pX/pY backwards from their final positions, so
; it presumably relies on n >= (iIirCoefCount - 1) / 2 - confirm with callers.

	move.l	8(a2), a3				;pCurHistory = pIIR->pIirHistory;
	move.l	4(a2), d0				;i = pIIR->iIirCoefCount;
	subq.l	#1, d0					;i--;
	move.l	d0, 12(a2)				;pIIR->iIirHistoryCount = i;
	lsl.l	#1, d0					;i *= 2;   (history size in bytes)
	beq		.EndHist				;if(i != 0)   (Z comes from the lsl above)
									;{
	adda.l	d0, a3					;pCurHistory += i bytes;   (one past the end)
									;do{
.NextHist:
	move.w	-(a0), -(a3)			;*--pCurHistory = *--pX;
	move.w	-(a1), -(a3)			;*--pCurHistory = *--pY;
	
	subq.l	#4, d0					;i -= 4;   (two 16-bit words were stored)
	bgt		.NextHist				;}while(i > 0);
									;}
.EndHist:
	
;--==Restoring old MAC status register==--
	move.l	60(a7), d0
	move.l	d0, MACSR
	
;--==Restoring the saved registers and releasing the 64-byte frame==--
	movem.l	(a7), d0-d7/a0-a6 		
	lea		64(a7), a7				
	rts
